library("readr")
library("igraph")
library("dplyr")
library("stringr")
library("scales")
library("textreuse")
source("R/helper.R")
source("R/section-matches.R")

Read the data.

load("cache/corpus-lsh.rda")

Network graphs

Create a network graph based on section percentages.

edges_pct <- summary_matches %>% 
  filter(percent_borrowed >= 0.05,
         !is.na(match_code)) %>% 
  select(borrower_code, match_code, weight = percent_borrowed) %>% 
  group_by(borrower_code) %>% 
  top_n(2, weight)
edges_pct
## Source: local data frame [136 x 3]
## Groups: borrower_code [83]
## 
##    borrower_code match_code weight
##            (chr)      (chr)  (dbl)
## 1         AK1900     OR1862 0.5937
## 2         AR1868     KY1851 0.3634
## 3         AR1868     KY1854 0.3158
## 4         AR1874     AR1868 0.6752
## 5         AR1874     KY1851 0.0818
## 6         AZ1865     CA1851 0.5476
## 7         AZ1865     CA1858 0.2590
## 8         AZ1887     CA1872 0.4313
## 9         CA1850     NY1849 0.2972
## 10        CA1850     NY1850 0.1207
## ..           ...        ...    ...
g <- graph_from_data_frame(edges_pct, directed = TRUE) 
nodes <- distances(g, to = "NY1850", algorithm = "unweighted") %>% as.data.frame() %>% 
  add_rownames() %>% 
  rename(name = rowname, distance = NY1850) %>% 
  mutate(color = ifelse(distance == 0, "red",
                        ifelse(distance == 1, "green",
                               ifelse(distance == 2, "yellow", "lightblue"))))
## Warning in distances(g, to = "NY1850", algorithm = "unweighted"):
## Unweighted algorithm chosen, weights ignored
nodes[nodes$name == "NY1848", "color"] <- "red"
nodes[nodes$name == "NY1849", "color"] <- "red"
nodes[nodes$name == "NY1850", "color"] <- "red"
nodes[nodes$name == "NY1851", "color"] <- "red"
g <- graph_from_data_frame(edges_pct, directed = TRUE, vertices = nodes) 
V(g)$year <- V(g)$name %>% extract_date()
set.seed(4221)

g <- add_layout_(g, with_graphopt(niter = 4000, spring.length = 25), normalize())

plot_before_year <- function(x, year) {
  x_before <- induced.subgraph(x, which(V(x)$year <= year))
  n <- V(x)$name
  n_before <- V(x_before)$name
  filter <- n %in% n_before
  x_before$layout <- x_before$layout[filter, ]
  par(mar = c(0,0,1,0))
  plot(x_before, edge.width = E(x_before)$weight * 8,
       edge.arrow.size = 0.0, vertex.size = 5)
  title(paste0("Codes of Civil Procedure before ", year))
} 

for (i in seq(1850, 1900, 5)) {
  plot_before_year(g, i)
}

Create a graph based on numbers (not percentages) of sections shared. Notice that we are keeping only code to code matches that share a certain number of sections (minimum_n), we are keeping only a certain number of matches for each code (top_matches), and we are omitting codes that aren’t part of the main network.

minimum_n <- 20
top_matches <- 2
codes_not_to_plot <- c(
  # "CO868", 
  # "CT1879", 
  # "FL1847", 
  # "FL1892", 
  "GA1851",
  "GA1860", 
  "HI1859", 
  "HI1897",
  # "IL1866", 
  "LA1825", 
  "LA1844" 
#   "MS1848", 
#   "MS1857", 
  # "NY1876", 
  # "NY1879",
  # "VA1860", 
  # "VA1893", 
  # "WV1868" 
  )
# codes_not_to_plot <- NULL

edges_n <- summary_matches %>% 
  filter(!is.na(match_code),
         sections_borrowed >= minimum_n) %>%  
  select(borrower_code, match_code, weight = sections_borrowed) %>% 
  group_by(borrower_code) %>% 
  top_n(top_matches, weight) %>% 
  ungroup() %>% 
  mutate(weight = rescale(weight)) %>% 
  filter(!borrower_code %in% codes_not_to_plot,
         !match_code %in% codes_not_to_plot)
edges_n
## Source: local data frame [146 x 3]
## 
##    borrower_code match_code      weight
##            (chr)      (chr)       (dbl)
## 1         AK1900     OR1862 0.398192771
## 2         AK1900     NY1850 0.002409639
## 3         AK1900     OR1854 0.002409639
## 4         AR1868     KY1851 0.209036145
## 5         AR1868     KY1854 0.180120482
## 6         AR1874     AR1868 0.146987952
## 7         AR1874     KY1851 0.007228916
## 8         AZ1865     CA1851 0.199397590
## 9         AZ1865     CA1858 0.087951807
## 10        AZ1887     CA1872 0.203614458
## ..           ...        ...         ...
g_n <- graph_from_data_frame(edges_n, directed = TRUE) 
node_distances <- distances(g_n, to = c("NY1848", "NY1849", "NY1850", "NY1851"),
                     algorithm = "unweighted") %>% 
                     apply(1, min, na.rm = TRUE)
## Warning in distances(g_n, to = c("NY1848", "NY1849", "NY1850", "NY1851"), :
## Unweighted algorithm chosen, weights ignored
nodes_n <- data_frame(name = names(node_distances), distance = node_distances) %>% 
  mutate(color = ifelse(distance == 0, "red",
                        ifelse(distance == 1, "green",
                               ifelse(distance == 2, "yellow", "lightblue"))))

g_n <- graph_from_data_frame(edges_n, directed = TRUE, vertices = nodes_n) 
V(g_n)$year <- V(g_n)$name %>% extract_date()

edge_size_clamp <- function(g, multiplier = 20, max_val = 6, min_val = 1) {
  w <- E(g)$weight * multiplier
  w[w > max_val] <- max_val
  w[w < min_val] <- min_val
  w
}

set.seed(4221)
g_n <- g_n %>% add_layout_(with_graphopt(niter = 4000, spring.length = 25),
                           normalize())
par(mar = c(0,0,1,0))
plot(g_n, edge.width = edge_size_clamp(g_n), edge.arrow.size = 0, vertex.size = 5)
title("Borrowings between codes, number of sections borrowed")

Now do a state to state network:

min_state_borrowings <- 100
top_matches <- 2
edges_states <- summary_matches %>% 
  mutate(borrower_date = extract_date(borrower_code),
         match_date = extract_date(match_code),
         borrower_state = extract_state(borrower_code),
         match_state = extract_state(match_code)) %>% 
  filter(!is.na(match_code),
         borrower_date >= match_date,
         borrower_state != match_state) %>% 
  group_by(borrower_state, match_state) %>% 
  summarize(n = sum(sections_borrowed)) %>% 
  filter(n >= min_state_borrowings) %>% 
  select(borrower_state, match_state, weight = n) %>% 
  group_by(borrower_state) %>% 
  top_n(top_matches, weight) %>% 
  ungroup() %>% 
  mutate(weight = rescale(weight))

edges_states
## Source: local data frame [44 x 3]
## 
##    borrower_state match_state     weight
##             (chr)       (chr)      (dbl)
## 1              AK          OR 0.57655039
## 2              AR          KY 0.60562016
## 3              AZ          CA 0.75484496
## 4              CA          NY 0.69670543
## 5              CO          CA 0.05910853
## 6              CO          IL 0.04166667
## 7              DC          IN 0.11821705
## 8              DT          ND 0.22674419
## 9              DT          NE 0.19476744
## 10             FL          NY 0.03488372
## ..            ...         ...        ...
g_states <- graph_from_data_frame(edges_states, directed = TRUE)

state_distances <- distances(g_states, to = "NY", algorithm = "unweighted") 
## Warning in distances(g_states, to = "NY", algorithm = "unweighted"):
## Unweighted algorithm chosen, weights ignored
nodes_states <- data_frame(name = rownames(state_distances),
                           distance = state_distances[, 1]) %>% 
  mutate(color = ifelse(distance == 0, "red",
                        ifelse(distance == 1, "green",
                               ifelse(distance == 2, "yellow", "lightblue"))))

g_states <- graph_from_data_frame(edges_states, directed = TRUE,
                                  vertices = nodes_states) %>% 
  decompose(min.vertices = 3) %>% 
  `[[`(1)

set.seed(4221)
g_states <- g_states %>% add_layout_(with_graphopt(niter = 4000,
                                                   spring.length = 25),
                                     normalize())
par(mar = c(0,0,1,0))
plot(g_states, 
     edge.width = edge_size_clamp(g_n), edge.arrow.size = 0.5,
     edge.arrow.mode = 1,
     vertex.size = 5, vertex.label.dist = 0.85, vertex.label.degree = pi)
title("Borrowings between states, number of sections borrowed")